1.Medal Counts over Time

setwd("/Users/luqi/Desktop")
ath = read.csv("athletes_and_events.csv")
nocregions = read.csv("noc_regions.csv")
gdppop = read.csv("gdp_pop.csv")
#install.packages("tidyverse")
athsummer <- ath[which(ath$Season=='Summer'),]
library(tidyr)
tathsummer = drop_na(athsummer, Medal)
tathsummer= tathsummer[!duplicated(tathsummer), ]
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#group_by(athsummer,NOC, add = FALSE)
cleandata <- tathsummer %>%
  group_by(NOC) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze"))) %>%
  arrange(desc(.$medal_number))
#cleandata
tathsummerUSA <- ath[which(tathsummer$NOC=='USA'),]
cleandataUSA <- tathsummerUSA %>%
  group_by(Year,Sex) %>%
summarize(gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze"))) 
cleandataUSA
## # A tibble: 65 x 5
## # Groups:   Year [35]
##     Year Sex   gold_number silver_number bronze_number
##    <int> <fct>       <int>         <int>         <int>
##  1  1896 M               0             0             0
##  2  1900 M               4             8             2
##  3  1904 M               1             3             1
##  4  1906 M               1             1             3
##  5  1908 F               0             0             0
##  6  1908 M               5             3             4
##  7  1912 M              10             8            10
##  8  1920 F               0             0             1
##  9  1920 M               6             8             8
## 10  1924 F               0             0             0
## # … with 55 more rows
library(ggplot2)
# ggplot(data=cleandataUSA,
#             aes(x=Year,
#                 y=gold_number))+
#   geom_line(aes(group=Sex,
#                   color=Sex))
# look at USA's gold medal number by time trend, considering the gneder of the gold medal winners
p = ggplot(data=cleandataUSA,
            aes(x=Year,
                y=gold_number))
p + geom_line(aes(group=Sex,
                  color=Sex))+
         labs(x="Year",
         y="Gold Medal Number",
         color="Gender") +
   ggtitle("USA's Gold Medal Number by Sex")+ geom_point(size=2,aes(color=Sex))

cleandata_add_Medal <- tathsummer %>%
  group_by(NOC,Medal) %>%
summarize(medal_number = length(Medal))
#cleandata_add_Medal
cleandata_add_Medal_top5 <- cleandata_add_Medal[which(cleandata_add_Medal$NOC == 'USA'|cleandata_add_Medal$NOC =='URS'|cleandata_add_Medal$NOC =='GBR'|cleandata_add_Medal$NOC =='GER'|cleandata_add_Medal$NOC =='FRA'),]
cleandata_add_Medal_top5
## # A tibble: 15 x 3
## # Groups:   NOC [5]
##    NOC   Medal  medal_number
##    <fct> <fct>         <int>
##  1 FRA   Bronze          587
##  2 FRA   Gold            463
##  3 FRA   Silver          567
##  4 GBR   Bronze          620
##  5 GBR   Gold            635
##  6 GBR   Silver          729
##  7 GER   Bronze          649
##  8 GER   Gold            592
##  9 GER   Silver          538
## 10 URS   Bronze          596
## 11 URS   Gold            832
## 12 URS   Silver          635
## 13 USA   Bronze         1197
## 14 USA   Gold           2472
## 15 USA   Silver         1333
g = ggplot(data=cleandata_add_Medal_top5,aes(NOC,fill=Medal))
g + geom_bar(aes(weight=medal_number),position = "dodge")+scale_fill_manual(values=c("darkred", "gold","gray40"))+ggtitle("Top 5 Medal winning NOCs")+labs(x="NOC",
         y="Medal Number") 

# I would recommend the first visualization because we can see some interesting pattern in the graph, as time goes by, in USA, the gap of medal numbers won by male and female is narrowing. 

2.Medal Counts adjusted by Population, GDP

tath = drop_na(ath, Medal)
tath = tath[!duplicated(tath), ]
gdppop=gdppop[!duplicated(gdppop), ]
addvalue<- tath %>%
  group_by(NOC) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze")),medal_value = 3*as.numeric(gold_number) + 2*as.numeric(silver_number)+as.numeric(bronze_number)) %>%
  arrange(desc(.$medal_number))
#addvalue

fulltable= merge(x = addvalue, y = gdppop, by.x = "NOC", by.y = "Code", all = TRUE)
#fulltable
fulltable$medal_value_byGDP = (as.numeric(fulltable$medal_value)/as.numeric(fulltable$GDP.per.Capita))
fulltable$medal_value_byPOP = (as.numeric(fulltable$medal_value)/as.numeric(fulltable$Population))*1000000
fulltable$highlight <- fulltable$NOC == 'CHN'
fulltable <- fulltable %>% 
  mutate(highlight=replace(highlight, highlight == "TRUE", "CHN"), highlight=replace(highlight, highlight == "FALSE", "OTHER"))
Ranking_plot1=ggplot(fulltable, aes(x = GDP.per.Capita, y = medal_value)) + 
  geom_point(aes(colour = highlight)) + 
  scale_colour_manual(values = c("OTHER" = "black", "CHN" = "red"))+ 
  labs(x="GDP.per.Capita",
         y="Unadjusted Medal Value",
         color="NOC") +
   ggtitle("CHN's Ranking in Unadjusted Medal Value")
Ranking_plot1
## Warning: Removed 99 rows containing missing values (geom_point).

Ranking_plot2=ggplot(fulltable, aes(x = GDP.per.Capita, y = medal_value_byGDP)) + 
  geom_point(aes(colour = highlight)) + 
  scale_colour_manual(values = c("OTHER" = "black", "CHN" = "red"))+ 
  labs(x="GDP.per.Capita",
         y="Medal Value Adjusted by GDP",
         color="NOC") +
   ggtitle("CHN's Ranking in Adjusted Medal Value(by GDP)")
Ranking_plot2
## Warning: Removed 99 rows containing missing values (geom_point).

Ranking_plot3=ggplot(fulltable, aes(x = GDP.per.Capita, y = medal_value_byPOP)) + 
  geom_point(aes(colour = highlight)) + 
  scale_colour_manual(values = c("OTHER" = "black", "CHN" = "red"))+
  labs(x="GDP.per.Capita",
         y="Medal Value Adjusted by Population",
         color="NOC") +
   ggtitle("CHN's Ranking in Adjusted Medal Value(by Population)")
Ranking_plot3
## Warning: Removed 99 rows containing missing values (geom_point).

library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(Ranking_plot1, Ranking_plot2, Ranking_plot3,nrow=3)
## Warning: Removed 99 rows containing missing values (geom_point).

## Warning: Removed 99 rows containing missing values (geom_point).

## Warning: Removed 99 rows containing missing values (geom_point).

## From the comparison of 3 plots we can see that CHN ranks top when medal_value is adjusted by GDP, and ranks very low when medal_value is adjusted by population, which shows that CHN has a really large population.

3.Host Country Advantage

library(rvest)
## Loading required package: xml2
library(stringr)
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/Summer_Olympic_Games") 
hosts <- html_table(html_nodes(wiki_hosts, "table")[[8]], fill=TRUE)
hosts <- hosts[-1,1:3]
hosts$city <- str_split_fixed(hosts$Host, n=2, ",")[,1]
hosts$country <- str_split_fixed(hosts$Host, n=2, ", ")[,2]
tathsummer_host <- tathsummer[which(tathsummer$NOC =='FRA'|tathsummer$NOC =='USA'|tathsummer$NOC == 'GBR'|tathsummer$NOC =='SWE'|tathsummer$NOC =='BEL'|tathsummer$NOC =='NED'|tathsummer$NOC =='GER'|tathsummer$NOC =='GBR'|tathsummer$NOC =='FIN'|tathsummer$NOC =='ANZ'|tathsummer$NOC =='ITA'|tathsummer$NOC =='JPN'|tathsummer$NOC =='MEX'|tathsummer$NOC =='CAN'|tathsummer$NOC =='URS'|tathsummer$NOC =='KOR'|tathsummer$NOC =='ESP'|tathsummer$NOC =='GRE'|tathsummer$NOC =='CHN'|tathsummer$NOC =='BRA'),]
#tathsummer_host
tathsummer_host_addyear<- tathsummer_host %>%
  group_by(NOC,Year) %>%
summarize(medal_number = length(Medal))
#tathsummer_host_addyear
tathsummer_host_group<- tathsummer_host %>%
  group_by(NOC) %>%
summarize(medal_number = length(Medal))
#tathsummer_host_group
NOC = c("ANZ","ANZ","BEL","BEL","BRA","BRA","CAN","CAN","CHN","CHN","ESP","ESP","FIN","FIN","FRA","FRA","GBR","BGR","GER","GER","GRE","GRE","ITA","ITA","JPN","JPN","KOR","KOR","MEX","MEX","NED","NED","SWE","SWE","URS","URS","USA","USA")
Host = c("true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false")
Yearly_Average_MedalNumber = c(0,1.1,188,9.9,50,15.7,23,26.6,184,26.9,69,15.5,40,16.1,172.5,49.3,247,57.3,224,91.4,39.5,5.6,88,50.3,62,29.2,77,17.6,9,3.7,57,31.9,190,34,442,60,298.5,158.7)
df = data.frame(NOC,Host,Yearly_Average_MedalNumber)
df
##    NOC  Host Yearly_Average_MedalNumber
## 1  ANZ  true                        0.0
## 2  ANZ false                        1.1
## 3  BEL  true                      188.0
## 4  BEL false                        9.9
## 5  BRA  true                       50.0
## 6  BRA false                       15.7
## 7  CAN  true                       23.0
## 8  CAN false                       26.6
## 9  CHN  true                      184.0
## 10 CHN false                       26.9
## 11 ESP  true                       69.0
## 12 ESP false                       15.5
## 13 FIN  true                       40.0
## 14 FIN false                       16.1
## 15 FRA  true                      172.5
## 16 FRA false                       49.3
## 17 GBR  true                      247.0
## 18 BGR false                       57.3
## 19 GER  true                      224.0
## 20 GER false                       91.4
## 21 GRE  true                       39.5
## 22 GRE false                        5.6
## 23 ITA  true                       88.0
## 24 ITA false                       50.3
## 25 JPN  true                       62.0
## 26 JPN false                       29.2
## 27 KOR  true                       77.0
## 28 KOR false                       17.6
## 29 MEX  true                        9.0
## 30 MEX false                        3.7
## 31 NED  true                       57.0
## 32 NED false                       31.9
## 33 SWE  true                      190.0
## 34 SWE false                       34.0
## 35 URS  true                      442.0
## 36 URS false                       60.0
## 37 USA  true                      298.5
## 38 USA false                      158.7
Adv_plot=ggplot(df, aes(x = NOC, y = Yearly_Average_MedalNumber )) + 
  geom_point(aes(colour = Host)) + 
   ggtitle("Host Country Advantage")
Adv_plot

4. Most successful athletes

namedata <- tath %>%
  group_by(Name,Medal,Sex) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze"))) %>%
  arrange(desc(.$medal_number))
#namedata
## I define "most successful athletes"as athletes who won most gold medals, let's look at top 10 gold medal winners and see the gender distribution
namedata_top10 = namedata[1:10,]
namedata_top10
## # A tibble: 10 x 7
## # Groups:   Name, Medal [10]
##    Name    Medal Sex   medal_number gold_number silver_number bronze_number
##    <fct>   <fct> <fct>        <int>       <int>         <int>         <int>
##  1 Michae… Gold  M               23          23             0             0
##  2 "Raymo… Gold  M               10          10             0             0
##  3 "Frede… Gold  M                9           9             0             0
##  4 Larysa… Gold  F                9           9             0             0
##  5 Mark A… Gold  M                9           9             0             0
##  6 Paavo … Gold  M                9           9             0             0
##  7 Birgit… Gold  F                8           8             0             0
##  8 "Jenni… Gold  F                8           8             0             0
##  9 "Matth… Gold  M                8           8             0             0
## 10 Ole Ei… Gold  M                8           8             0             0
g2 = ggplot(data=namedata_top10,aes(Name,fill=Sex))
g2 + geom_bar(aes(weight=gold_number),position = position_stack(reverse = TRUE))+coord_flip()+ggtitle("Top 10 Gold-Medal winning Atheletes")+labs(x="Name",
         y="Gold Medal Number") +theme(legend.position = "top")

namedata_addsport_year <- tath %>%
  group_by(Name,Sport,Year) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold"))) %>%
 arrange(desc(.$gold_number))
#namedata_addsport_year
namedata_addsport_year_top10 <- namedata_addsport_year[which(namedata_addsport_year$Name == 'Michael Fred Phelps, II'|namedata_addsport_year$Name == 'Raymond Clarence "Ray" Ewry'|namedata_addsport_year$Name == 'Frederick Carlton "Carl" Lewis'|namedata_addsport_year$Name == 'Larysa Semenivna Latynina (Diriy-)'|namedata_addsport_year$Name == 'Mark Andrew Spitz'|namedata_addsport_year$Name == 'Paavo Johannes Nurmi'|namedata_addsport_year$Name == 'Birgit Fischer-Schmidt'|namedata_addsport_year$Name == 'Jennifer Elisabeth "Jenny" Thompson (-Cumpelik)'|namedata_addsport_year$Name == 'Matthew Nicholas "Matt" Biondi'|namedata_addsport_year$Name == 'Ole Einar Bjrndalen'),]
p2 <- ggplot(data=namedata_addsport_year_top10,
            aes(x=Year,
                y=gold_number))
p2 + geom_line(aes(group=Name,
                  color=Sport)) +
    labs(x="Year",
         y="Gold Medal Number",
         color="Sport") + geom_point(size=4,aes(color=Sport)) + ggtitle("Top 10 Gold-Medal winning Atheletes' Sports")

## For this graph, every tiny line is an athlete, my interesting finding is that before 1925, top athletes are crazy about winning medals in Atheletics, and later on, the popular sport became swimming

5.Make two plots interactive

library(devtools)
devtools::install_github("ropensci/plotly",force=TRUE)
## Downloading GitHub repo ropensci/plotly@master
## from URL https://api.github.com/repos/ropensci/plotly/zipball/master
## Installing plotly
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
##   --no-environ --no-save --no-restore --quiet CMD INSTALL  \
##   '/private/var/folders/rx/w35t6lg902l19fdtv5wqjfgr0000gn/T/Rtmpqu1upQ/devtools7c0319d4b1f/ropensci-plotly-c05f001'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.5/Resources/library'  \
##   --install-tests
## 
R.home(component = "home")
## [1] "/Library/Frameworks/R.framework/Resources"
#install.packages("usethis")
library(usethis)
## 
## Attaching package: 'usethis'
## The following objects are masked from 'package:devtools':
## 
##     use_appveyor, use_build_ignore, use_code_of_conduct,
##     use_coverage, use_cran_badge, use_cran_comments, use_data,
##     use_data_raw, use_dev_version, use_git, use_git_hook,
##     use_github, use_github_links, use_gpl3_license,
##     use_mit_license, use_news_md, use_package, use_package_doc,
##     use_rcpp, use_readme_md, use_readme_rmd, use_revdep,
##     use_rstudio, use_test, use_testthat, use_travis, use_vignette
usethis::edit_r_environ()
## ● Edit /Users/luqi/.Renviron
## ● Restart R for changes to take effect
Sys.setenv("plotly_username"="luqi.chen")
Sys.setenv("plotly_api_key"="R9LBpABPVy7aUJ3Sx7jf")
interation_1=p + geom_line(aes(group=Sex,
                  color=Sex))+
         labs(x="Year",
         y="Gold Medal Number",
         color="Gender") +
   ggtitle("USA's Gold Medal Number by Sex")+ geom_point(size=2,aes(color=Sex))
#install.packages('plotly')
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(interation_1)
## In this interation graph, readers can easily get the statistic of gold number and year by pointing to each spot.
interation_2 = g + geom_bar(aes(weight=medal_number),position = "dodge")+scale_fill_manual(values=c("darkred", "gold","gray40"))+ggtitle("Top 5 Medal winning NOCs")+labs(x="NOC",
         y="Medal Number") 
ggplotly(interation_2)
# In this interation graph, readers can easily read the specific number of medals by pointing to each bar.

6.Data Table

#install.packages('DT')
library(DT)
datatable(cleandata_add_Medal_top5)
library(stringr)
pretty_headers <- 
  gsub("[.]", " ", colnames(cleandata_add_Medal_top5)) %>%
  str_to_title()
cleandata_add_Medal_top5 %>%
  datatable(
    rownames = FALSE,
    colnames = pretty_headers,
    filter = list(position = "top"),
    options = list(language = list(sSearch = "Filter:"))
  )
## In this datatable, I can provide the medal information for a particular NOC(by using the column filter of Noc), I can also provide how gold medals are distributed in the top 5 Medal Winning NOCs(by using the column filter of Medal)